Environment Variables and Packages

options(java.parameters = "-Xmx2048m",
        stringsAsFactors = FALSE, 
        encoding = 'UTF-8')

suppressPackageStartupMessages({
  # ISLR2
  library(ISLR2)
  # Schmidt
  library(far)
  # kernel density
  library(ks)
  # DM
  library(zip)
  library(openxlsx)
  library(readxl)
  library(writexl)
  library(RcppRoll)
  library(plyr)
  library(stringi)
  library(feather)
  library(RODBC)
  library(MASS)
  library(car)
  library(data.table)
  library(lubridate)
  library(plotly)
  library(pROC)
  library(tidymodels)
  library(tidyverse)
})

Auto

environment variables.

color.origin <- c('1' = '#19d3f3', '2' = '#FFB6C1', '3' = '#00FF00')

Scatter-plot matrix, or splom.

auto.splom <- plot_ly(ISLR2::Auto, 
                      color = as.integer(ISLR2::Auto$origin), 
                      colors = color.origin) %>% 
  add_trace(
    type = 'splom',
    dimensions = list(
      list(label = 'mpg', values = ~mpg), 
      list(label = 'cyl', values = ~cylinders), 
      list(label = 'displace', values = ~displacement), 
      list(label = 'hp', values = ~horsepower), 
      list(label = 'weight', values = ~weight), 
      list(label = 'acc', values = ~acceleration), 
      list(label = 'year', values = ~year), 
      list(label = 'origin', values = ~origin)
    ), 
    text = ~origin, 
    marker = list(
      size = 3
    )
  ) %>% 
  hide_colorbar() %>% 
  layout(
    title = 'Scatter-plot Matrix of Auto', 
    hovermode = 'closest', 
    dragmode = 'select', 
    plot_bgcolor = 'rgba(240,240,240, 0.95)', 
    autosize = TRUE, 
    xaxis = list(domain = NULL, showline = F, zeroline = F, gridcolor = '#ffff', ticklen = 2), 
    yaxis=list(domain = NULL, showline = F, zeroline = F, gridcolor = '#ffff', ticklen = 2)
  )

auto.splom

Correlation matrix.

cor(ISLR2::Auto[, 1:8])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

Multivariate linear regression.

auto.dum <- ISLR2::Auto[, 1:8] %>% 
  mutate(American = if_else(origin == 1, 1, 0), 
         European = if_else(origin == 2, 1, 0)) %>% 
  select(-origin)

auto.lr <- lm(mpg ~ ., data = auto.dum)
summary(auto.lr)
## 
## Call:
## lm(formula = mpg ~ ., data = auto.dum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0095 -2.0785 -0.0982  1.9856 13.3608 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.510e+01  4.681e+00  -3.226  0.00136 ** 
## cylinders    -4.897e-01  3.212e-01  -1.524  0.12821    
## displacement  2.398e-02  7.653e-03   3.133  0.00186 ** 
## horsepower   -1.818e-02  1.371e-02  -1.326  0.18549    
## weight       -6.710e-03  6.551e-04 -10.243  < 2e-16 ***
## acceleration  7.910e-02  9.822e-02   0.805  0.42110    
## year          7.770e-01  5.178e-02  15.005  < 2e-16 ***
## American     -2.853e+00  5.527e-01  -5.162 3.93e-07 ***
## European     -2.232e-01  5.661e-01  -0.394  0.69355    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.307 on 383 degrees of freedom
## Multiple R-squared:  0.8242, Adjusted R-squared:  0.8205 
## F-statistic: 224.5 on 8 and 383 DF,  p-value: < 2.2e-16

Linear regression diagnostic plot.

par(mfrow = c(2, 2))
plot(auto.lr)

Stepwise regression.

auto.lr.inter <- lm(mpg ~ 1, data = auto.dum)
auto.lr.both <- stats::step(auto.lr.inter, scope = formula(auto.lr), direction = 'both', trace = 0)
summary(auto.lr.both)
## 
## Call:
## lm(formula = mpg ~ weight + year + American + displacement + 
##     horsepower + cylinders, data = auto.dum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0599 -2.0866 -0.1227  1.9076 13.5115 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -13.839999   4.113203  -3.365 0.000843 ***
## weight        -0.006505   0.000564 -11.534  < 2e-16 ***
## year           0.777686   0.050646  15.355  < 2e-16 ***
## American      -2.751258   0.481795  -5.710 2.26e-08 ***
## displacement   0.023493   0.007598   3.092 0.002133 ** 
## horsepower    -0.024557   0.010705  -2.294 0.022329 *  
## cylinders     -0.496154   0.319883  -1.551 0.121712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.301 on 385 degrees of freedom
## Multiple R-squared:  0.8238, Adjusted R-squared:  0.8211 
## F-statistic: 300.1 on 6 and 385 DF,  p-value: < 2.2e-16

weight, year, American.

auto.lr.both3 <- lm(mpg ~ weight + year + American, data = auto.dum)
summary(auto.lr.both3)
## 
## Call:
## lm(formula = mpg ~ weight + year + American, data = auto.dum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4841 -2.1141 -0.0192  1.7795 13.6261 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.639e+01  3.921e+00  -4.180 3.60e-05 ***
## weight      -5.893e-03  2.593e-04 -22.731  < 2e-16 ***
## year         7.725e-01  4.823e-02  16.017  < 2e-16 ***
## American    -2.095e+00  4.361e-01  -4.804 2.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.334 on 388 degrees of freedom
## Multiple R-squared:  0.8189, Adjusted R-squared:  0.8176 
## F-statistic:   585 on 3 and 388 DF,  p-value: < 2.2e-16

Successive orthogonalization.

auto.ortho <- auto.dum[, -1] %>% 
  mutate(x0 = 1, .before = 1) %>% 
  as.matrix() %>% 
  far::orthonormalization(basis = FALSE, norm = FALSE)

auto.lr.ortho.coef <- c()
for (i in 2:ncol(auto.ortho)) {
  auto.lr.res <- lm(auto.dum$mpg ~ auto.ortho[, i])
  auto.lr.ortho.coef[names(auto.dum)[i]] <- auto.lr.res$coefficients[2]
}

sort(auto.lr.ortho.coef)
##    cylinders     American     European   horsepower displacement acceleration 
## -3.558078368 -2.746947282 -0.223225868 -0.059935368 -0.051118499 -0.029104714 
##       weight         year 
## -0.005277173  0.753367180

cylinders, American, year.

auto.lr.ortho3 <- lm(mpg ~ cylinders + American + year, data = auto.dum)
summary(auto.lr.ortho3)
## 
## Call:
## lm(formula = mpg ~ cylinders + American + year, data = auto.dum)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0529  -2.5583  -0.2539   2.2336  14.5046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -20.85015    4.79913  -4.345 1.78e-05 ***
## cylinders    -2.44655    0.15958 -15.331  < 2e-16 ***
## American     -3.03313    0.53189  -5.703 2.34e-08 ***
## year          0.78415    0.05908  13.274  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.017 on 388 degrees of freedom
## Multiple R-squared:  0.7371, Adjusted R-squared:  0.7351 
## F-statistic: 362.6 on 3 and 388 DF,  p-value: < 2.2e-16

Hitters

High salary.

plot(sort(ISLR2::Hitters$Salary))

Set threshold as 500.

hitters.class <- ISLR2::Hitters %>% 
  filter(!is.na(Salary)) %>% 
  mutate(Salary = if_else(Salary > 500, 1, 0), 
         League = if_else(League == 'A', 1, 0), 
         Division = if_else(Division == 'E', 1, 0), 
         NewLeague = if_else(NewLeague == 'A', 1, 0))

Stepwise.

hitters.lr.inter <- lm(Salary ~ 1, data = hitters.class)
hitters.lr.all <- lm(Salary ~ ., data = hitters.class)
hitters.lr.both <- stats::step(hitters.lr.inter, scope = formula(hitters.lr.all), direction = 'both', trace = 0)
summary(hitters.lr.both)
## 
## Call:
## lm(formula = Salary ~ CHits + Hits + Division + NewLeague + Walks, 
##     data = hitters.class)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97970 -0.26103 -0.05952  0.31275  1.33527 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.447e-01  7.008e-02  -3.491 0.000566 ***
## CHits        3.092e-04  3.834e-05   8.063 2.83e-14 ***
## Hits         3.219e-03  6.590e-04   4.885 1.82e-06 ***
## Division     1.114e-01  4.771e-02   2.334 0.020372 *  
## NewLeague   -1.072e-01  4.790e-02  -2.239 0.026023 *  
## Walks        2.505e-03  1.375e-03   1.822 0.069630 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3853 on 257 degrees of freedom
## Multiple R-squared:  0.4067, Adjusted R-squared:  0.3951 
## F-statistic: 35.23 on 5 and 257 DF,  p-value: < 2.2e-16

CHits, Hits.

Linear regression.

hitters.lr <- lm(Salary ~ CHits + Hits, data = hitters.class)
summary(hitters.lr)
## 
## Call:
## lm(formula = Salary ~ CHits + Hits, data = hitters.class)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97748 -0.26241 -0.05775  0.31725  1.21810 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.249e-01  6.447e-02  -3.488 0.000571 ***
## CHits        3.233e-04  3.861e-05   8.375 3.47e-15 ***
## Hits         3.869e-03  5.546e-04   6.978 2.49e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3937 on 260 degrees of freedom
## Multiple R-squared:  0.3734, Adjusted R-squared:  0.3686 
## F-statistic: 77.48 on 2 and 260 DF,  p-value: < 2.2e-16

Logistic regression.

hitters.logistic <- glm(Salary ~ CHits + Hits, family = binomial(link = 'logit'), data = hitters.class)
summary(hitters.logistic)
## 
## Call:
## glm(formula = Salary ~ CHits + Hits, family = binomial(link = "logit"), 
##     data = hitters.class)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4942  -0.6542  -0.3463   0.7403   2.9237  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.3009111  0.5536668  -7.768 7.97e-15 ***
## CHits        0.0020230  0.0003179   6.364 1.97e-10 ***
## Hits         0.0226544  0.0040287   5.623 1.87e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 358.79  on 262  degrees of freedom
## Residual deviance: 241.51  on 260  degrees of freedom
## AIC: 247.51
## 
## Number of Fisher Scoring iterations: 5

Accurate rate.

hitters.pred.logistic <- hitters.class %>% 
  mutate(p = hitters.logistic$fitted.values, 
         Salary_p = if_else(p > 0.5, 1, 0), 
         acc = if_else(Salary == Salary_p, 1, 0))

(hitters.logistic.acc <- sum(hitters.pred.logistic$acc) / nrow(hitters.pred.logistic))
## [1] 0.8136882
hitters.pred.lr <- hitters.class %>% 
  mutate(pred = hitters.lr$fitted.values, 
         Salary_p = round(pred), 
         acc = if_else(Salary == Salary_p, 1, 0))

(hitters.lr.acc <- sum(hitters.pred.lr$acc) / nrow(hitters.pred.lr))
## [1] 0.8174905

Kernel density estimation.

hitters.error.select <- hitters.pred.logistic %>% 
  filter(acc == 0) %>% 
  select(CHits, Hits)

hitters.kde <- ks::kde(x = hitters.error.select, verbose = TRUE)
## ================================================================================

Plot kernel density estimation.

plot(hitters.kde, display = 'slice', cont = c(25, 50, 75, 100))